Species: Explore

Get Species Observations

# file paths for cached observation data (CSV keeps full records,
# GeoJSON keeps just the spatial points)
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- FALSE  # set TRUE to force re-download from GBIF

# get species occurrence data from GBIF with coordinates
# NOTE: `||` (scalar or) is correct for a single-condition `if`;
# TRUE/FALSE spelled out (T/F are reassignable and unsafe)
if (!file.exists(obs_geo) || redo){
  # query GBIF for up to 10,000 georeferenced records of the species
  (res <- spocc::occ(
    query      = 'Phoenicopterus ruber',
    from       = 'gbif',
    has_coords = TRUE,
    limit      = 10000
  ))
  
  df <- res$gbif$data[[1]] 
  nrow(df) # number of rows
  readr::write_csv(df, obs_csv)
  
  # convert to sf points in WGS84 (EPSG:4326)
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn = TRUE)
}
obs <- sf::read_sf(obs_geo)
nrow(obs)
## [1] 10000
# show points on map
mapview::mapview(obs, map.types = "OpenTopoMap")

Get Environmental Data

Presence

dir_env <- file.path(dir_data, "env")

# set a default data directory for sdmpredictors downloads
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial (not marine) datasets for this species
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers available for those datasets
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")

# download (or load cached) rasters as a stack
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
plot(env_stack, nc=2)

obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) || redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))

if (!file.exists(env_stack_grd) || redo){
  # mask then crop the environmental stack to the observation hull,
  # so later sampling/prediction stays within the species' extent
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = TRUE)
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Pseudo-Absence

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) || redo){
  # get raster count of observations per cell
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask of cells WITHOUT observations, i.e. candidate cells
  # for sampling pseudo-absence points
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse = TRUE)
  
  # generate as many random absence points as observations, inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn = TRUE)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) || redo){

  # combine presence (present = 1) and absence (present = 0) into a
  # single set of labeled points with a unique ID per row
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = row_number()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn = TRUE)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>% 
    as_tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = FALSE,
    options = list(
      dom = "t",
      pageLength = 20))

Term Plots

# reshape to long form: one row per (present, predictor name, value)
d_long <- pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present)

# density of each predictor, faceted by variable, filled by
# absence (gray) vs presence (green)
ggplot(d_long, aes(x = value, fill = present)) +
  geom_density() +
  facet_wrap(~name, scales = "free") +
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw() +
  theme(
    legend.position      = c(1, 0),
    legend.justification = c(1, 0))
## Warning: Removed 1044 rows containing non-finite values (stat_density).

Species: Regress

# load packages for the regression section
librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)

dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = FALSE)
# pairwise scatter/density plots of predictors, colored by presence
GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 301 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 152 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 332 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 201 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning: Removed 197 rows containing missing values (geom_point).

## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 332 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 201 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning: Removed 197 rows containing missing values (geom_point).

## Warning: Removed 197 rows containing missing values (geom_point).

## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 332 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 201 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 197 rows containing missing values
## Warning: Removed 301 rows containing missing values (geom_point).
## Warning: Removed 332 rows containing missing values (geom_point).

## Warning: Removed 332 rows containing missing values (geom_point).

## Warning: Removed 332 rows containing missing values (geom_point).
## Warning: Removed 301 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 303 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 301 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 301 rows containing missing values
## Warning: Removed 152 rows containing missing values (geom_point).
## Warning: Removed 201 rows containing missing values (geom_point).

## Warning: Removed 201 rows containing missing values (geom_point).

## Warning: Removed 201 rows containing missing values (geom_point).
## Warning: Removed 303 rows containing missing values (geom_point).
## Warning: Removed 152 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 152 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 152 rows containing missing values
## Warning: Removed 197 rows containing missing values (geom_point).

## Warning: Removed 197 rows containing missing values (geom_point).

## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 301 rows containing missing values (geom_point).
## Warning: Removed 152 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing missing values (geom_point).

## Warning: Removed 197 rows containing missing values (geom_point).

## Warning: Removed 197 rows containing missing values (geom_point).
## Warning: Removed 301 rows containing missing values (geom_point).
## Warning: Removed 152 rows containing missing values (geom_point).

Logistic Regression

Setup Data

# setup model data: ID is a row identifier, not a predictor, and rows
# with missing raster values cannot be used for fitting
d <- pts_env %>% 
  select(!ID) %>% 
  tidyr::drop_na()
nrow(d)
## [1] 19668

Linear Model

# fit a linear model: present (0/1) on all remaining columns of d
# (predictions are unbounded, unlike the GLM/GAM below)
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07599 -0.31329  0.07745  0.34631  1.44589 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)  1.570404107  0.046298365  33.919 <0.0000000000000002 ***
## WC_alt      -0.000077068  0.000007708  -9.998 <0.0000000000000002 ***
## WC_bio1     -0.039247738  0.000720435 -54.478 <0.0000000000000002 ***
## WC_bio2     -0.051579810  0.001299683 -39.686 <0.0000000000000002 ***
## ER_tri      -0.004271455  0.000224818 -19.000 <0.0000000000000002 ***
## ER_topoWet   0.033495629  0.003747848   8.937 <0.0000000000000002 ***
## lon         -0.004265912  0.000081128 -52.583 <0.0000000000000002 ***
## lat         -0.008736233  0.000171005 -51.087 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3898 on 19660 degrees of freedom
## Multiple R-squared:  0.3923, Adjusted R-squared:  0.3921 
## F-statistic:  1813 on 7 and 19660 DF,  p-value: < 0.00000000000000022
# NOTE(review): predicting on pts_env (which still contains NA predictor
# rows) rather than d, so y_predict contains NAs and range() below
# returns NA NA — consider predict(mdl, d) or range(..., na.rm = TRUE)
y_predict <- predict(mdl, pts_env, type="response")
y_true    <- pts_env$present

range(y_predict)
## [1] NA NA
range(y_true)
## [1] 0 1

Generalized Linear Model

# fit a generalized linear model with a binomial logit link function,
# so predictions are probabilities bounded in (0, 1)
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8932  -0.6316  -0.0778   0.8326   3.7967  
## 
## Coefficients:
##                Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)  8.95095496  0.37591201  23.811 < 0.0000000000000002 ***
## WC_alt      -0.00024986  0.00005422  -4.608       0.000004056799 ***
## WC_bio1     -0.30131529  0.00640733 -47.027 < 0.0000000000000002 ***
## WC_bio2     -0.41142538  0.01062817 -38.711 < 0.0000000000000002 ***
## ER_tri      -0.03652296  0.00218994 -16.678 < 0.0000000000000002 ***
## ER_topoWet   0.18605164  0.02954437   6.297       0.000000000303 ***
## lon         -0.03068770  0.00070107 -43.773 < 0.0000000000000002 ***
## lat         -0.06701779  0.00148996 -44.980 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27261  on 19667  degrees of freedom
## Residual deviance: 17253  on 19660  degrees of freedom
## AIC: 17269
## 
## Number of Fisher Scoring iterations: 6
# predict on the NA-free fitting data; type="response" gives
# probabilities on the 0-1 scale rather than log-odds
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.0001601382 0.9910803961
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F)

Generalized Additive Model

# mgcv provides gam() with automatic smoothness selection
librarian::shelf(mgcv)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# fit a generalized additive model with smooth predictors: each s()
# term lets the response vary nonlinearly with that predictor
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)  -2.7728     0.2715  -10.21 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq             p-value    
## s(WC_alt)     8.870  8.983  692.12 <0.0000000000000002 ***
## s(WC_bio1)    7.632  8.178  463.75 <0.0000000000000002 ***
## s(WC_bio2)    8.917  8.997  366.79 <0.0000000000000002 ***
## s(ER_tri)     7.447  7.756   83.36 <0.0000000000000002 ***
## s(ER_topoWet) 7.237  8.130   55.26 <0.0000000000000002 ***
## s(lon)        8.997  9.000  632.41 <0.0000000000000002 ***
## s(lat)        8.878  8.989 1022.83 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.868   Deviance explained = 81.9%
## UBRE = -0.74255  Scale est. = 1         n = 19668
# show term plots (scale=0 lets each smooth use its own y range)
plot(mdl, scale=0)

Maximum Entropy

Maxent is probably the most commonly used species distribution model, since it performs well with few input data points, only requires presence points, and is easy to use with a Java graphical user interface (GUI).

# load extra packages
# NOTE(review): maptools was archived from CRAN (2023) — confirm it is
# still installable here, or source wrld_simpl from another package
librarian::shelf(
  maptools, sf)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# show version of maxent (dismo::maxent with no arguments)
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model; points falling outside the clipped
# raster get NA predictors (hence the warning below) and are dropped
mdl <- maxent(env_stack, obs_sp)
## Warning in .local(x, p, ...): 72 (7.42%) of the presence points have NA
## predictor values
## This is MaxEnt version 3.4.3
plot(mdl)

# plot term plots
response(mdl)

# predict habitat suitability across the full raster extent
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Species: Trees

# read the labeled point data back in
pts_env <- read_csv(pts_env_csv)

# prepare modeling data: categorical response for classification,
# drop the ID column and any incomplete rows
d <- pts_env %>% 
  mutate(
    present = factor(present)) %>%  # categorical response
  select(!ID) %>%                   # not used as a predictor x
  na.omit()                         # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 19668
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9979, 1: 9689

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 414.38 545.31 -38.00 13.00 143.50 612.25 3995.00 ▇▂▁▁▁
WC_bio1 0 1 22.47 4.62 -0.90 18.40 24.10 26.30 30.60 ▁▁▃▅▇
WC_bio2 0 1 11.51 3.24 4.90 9.20 11.30 14.20 20.40 ▃▇▇▆▁
ER_tri 0 1 14.38 21.94 0.00 2.74 7.42 15.56 295.78 ▇▁▁▁▁
ER_topoWet 0 1 12.24 1.41 7.00 11.36 12.41 13.34 15.80 ▁▂▆▇▁
lon 0 1 -27.00 49.06 -117.14 -76.29 -7.71 19.12 40.04 ▃▆▁▂▇
lat 0 1 3.90 22.04 -34.71 -13.96 10.60 21.32 52.05 ▆▃▆▇▁

Split data into training and testing

# create training set with 80% of full data, stratified on the
# response so both classes keep the same proportion
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)
# NOTE(review): the held-out 20% is never extracted here
# (rsample::testing(d_split)) — confirm whether a test set is used later

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 9979 9689
table(d_train$present)
## 
##    0    1 
## 7983 7751

Decision Trees

Partition, depth = 1

# run decision stump model: maxdepth = 1 forces a single split;
# cp = 0 disables complexity pruning so the split always happens
# NOTE(review): rpart.control() is the canonical way to build this
# control object; a plain list() is accepted but unchecked
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15734 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15734 7751 0 (0.50737257 0.49262743)  
##   2) WC_alt>=47.5 9320 1866 0 (0.79978541 0.20021459) *
##   3) WC_alt< 47.5 6414  529 1 (0.08247583 0.91752417) *
# plot tree (shrink margins so the full tree fits the device)
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)

Partition, depth=default

# decision tree with defaults (cp = 0.01, maxdepth = 30, 10-fold
# internal cross-validation)
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15734 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15734 7751 0 (0.50737257 0.49262743)  
##   2) WC_alt>=47.5 9320 1866 0 (0.79978541 0.20021459)  
##     4) lat>=-24.99979 7538  353 0 (0.95317060 0.04682940) *
##     5) lat< -24.99979 1782  269 1 (0.15095398 0.84904602) *
##   3) WC_alt< 47.5 6414  529 1 (0.08247583 0.91752417)  
##     6) lat>=30.20225 131   19 0 (0.85496183 0.14503817) *
##     7) lat< 30.20225 6283  417 1 (0.06636957 0.93363043) *
rpart.plot(mdl)

# plot complexity parameter vs cross-validated error
plotcp(mdl)

# rpart cross validation results (xerror column is the CV error)
mdl$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.69100761      0 1.0000000 1.0000000 0.008090673
## 2 0.16049542      1 0.3089924 0.3089924 0.005813492
## 3 0.01199845      2 0.1484970 0.1492711 0.004223995
## 4 0.01000000      3 0.1364985 0.1372726 0.004063578

Feature interpretation

# caret cross validation: tune the rpart cp parameter over 20 values
# with 10-fold CV
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

# CV accuracy as a function of cp
ggplot(mdl_caret)

# variable importance for the tuned tree
vip(mdl_caret, num_features = 40, bar = FALSE)

# Construct partial dependence plots: marginal effect of each
# predictor (and the lat x WC_bio2 interaction surface) on prediction
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.

Random Forests

Fit

# number of features (all columns except the response)
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# NOTE(review): present is a factor, so ranger fits a classification
# forest and prediction.error is the OOB misclassification rate —
# sqrt() of that is not an RMSE; confirm which metric is intended
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1090188

Feature Interpretation

# re-run model with impurity-based variable importance (Gini decrease;
# fast but can favor high-cardinality predictors)
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance (OOB error
# increase when a predictor is shuffled; slower but less biased)
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

# compare the two importance rankings side by side
gridExtra::grid.arrange(p1, p2, nrow = 1)